# Author: Stephen Situ
# Logistic Regression models the probability of an event taking place by having the log odds be a linear combination
# of the features. They are fitted into a sigmoid function sigma(x) = (1/(1+exp(-x)) where the output can range from 0 to 1.
# This is useful for predicting binary outcomes by defining thresholds (usually 0-0.499 & 0.500-1).
# We take a sample of breast cancer data that is diagnosed as benign ("B") or malignant ("M") (cancer) and train 
# a logistic regression model. 
# Original dataset: https://www.kaggle.com/datasets/vijayaadithyanvg/breast-cancer-prediction


# Import Libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# read CSV
breast_cancer_data = pd.read_csv("data.csv")


# Head
breast_cancer_data.head

<bound method NDFrame.head of            id diagnosis  Radius_mean  Texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         21.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0            0.11840           0.27760         0.30010              0.14710   
1            0.08474           0.07864         0.08690              0.07017   
2            0.10960           0.15990         0.19740              0.12790   
3            0.14250           0.28390         0.24140              0.10520   
4            0.10030           0.13280         0.19800              0.10430   
..               ...               ...             ...                  ...   
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  radius_worst  texture_worst  perimeter_worst  area_worst  \
0    ...        25.380          17.33           184.60      2019.0   
1    ...        24.990          23.41           158.80      1956.0   
2    ...        23.570          25.53           152.50      1709.0   
3    ...        14.910          26.50            98.87       567.7   
4    ...        22.540          16.67           152.20      1575.0   
..   ...           ...            ...              ...         ...   
564  ...        25.450          26.40           166.10      2027.0   
565  ...        23.690          38.25           155.00      1731.0   
566  ...        18.980          34.12           126.70      1124.0   
567  ...        25.740          39.42           184.60      1821.0   
568  ...         9.456          30.37            59.16       268.6   

     smoothness_worst  compactness_worst  concavity_worst  \
0             0.16220            0.66560           0.7119   
1             0.12380            0.18660           0.2416   
2             0.14440            0.42450           0.4504   
3             0.20980            0.86630           0.6869   
4             0.13740            0.20500           0.4000   
..                ...                ...              ...   
564           0.14100            0.21130           0.4107   
565           0.11660            0.19220           0.3215   
566           0.11390            0.30940           0.3403   
567           0.16500            0.86810           0.9387   
568           0.08996            0.06444           0.0000   

     concave points_worst  symmetry_worst  fractal_dimension_worst  
0                  0.2654          0.4601                  0.11890  
1                  0.1860          0.2750                  0.08902  
2                  0.2430          0.3613                  0.08758  
3                  0.2575          0.6638                  0.17300  
4                  0.1625          0.2364                  0.07678  
..                    ...             ...                      ...  
564                0.2216          0.2060                  0.07115  
565                0.1628          0.2572                  0.06637  
566                0.1418          0.2218                  0.07820  
567                0.2650          0.4087                  0.12400  
568                0.0000          0.2871                  0.07039  

[569 rows x 32 columns]>


# describe
breast_cancer_data.describe

<bound method NDFrame.describe of            id diagnosis  Radius_mean  Texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         21.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0            0.11840           0.27760         0.30010              0.14710   
1            0.08474           0.07864         0.08690              0.07017   
2            0.10960           0.15990         0.19740              0.12790   
3            0.14250           0.28390         0.24140              0.10520   
4            0.10030           0.13280         0.19800              0.10430   
..               ...               ...             ...                  ...   
564          0.11100           0.11590         0.24390              0.13890   
565          0.09780           0.10340         0.14400              0.09791   
566          0.08455           0.10230         0.09251              0.05302   
567          0.11780           0.27700         0.35140              0.15200   
568          0.05263           0.04362         0.00000              0.00000   

     ...  radius_worst  texture_worst  perimeter_worst  area_worst  \
0    ...        25.380          17.33           184.60      2019.0   
1    ...        24.990          23.41           158.80      1956.0   
2    ...        23.570          25.53           152.50      1709.0   
3    ...        14.910          26.50            98.87       567.7   
4    ...        22.540          16.67           152.20      1575.0   
..   ...           ...            ...              ...         ...   
564  ...        25.450          26.40           166.10      2027.0   
565  ...        23.690          38.25           155.00      1731.0   
566  ...        18.980          34.12           126.70      1124.0   
567  ...        25.740          39.42           184.60      1821.0   
568  ...         9.456          30.37            59.16       268.6   

     smoothness_worst  compactness_worst  concavity_worst  \
0             0.16220            0.66560           0.7119   
1             0.12380            0.18660           0.2416   
2             0.14440            0.42450           0.4504   
3             0.20980            0.86630           0.6869   
4             0.13740            0.20500           0.4000   
..                ...                ...              ...   
564           0.14100            0.21130           0.4107   
565           0.11660            0.19220           0.3215   
566           0.11390            0.30940           0.3403   
567           0.16500            0.86810           0.9387   
568           0.08996            0.06444           0.0000   

     concave points_worst  symmetry_worst  fractal_dimension_worst  
0                  0.2654          0.4601                  0.11890  
1                  0.1860          0.2750                  0.08902  
2                  0.2430          0.3613                  0.08758  
3                  0.2575          0.6638                  0.17300  
4                  0.1625          0.2364                  0.07678  
..                    ...             ...                      ...  
564                0.2216          0.2060                  0.07115  
565                0.1628          0.2572                  0.06637  
566                0.1418          0.2218                  0.07820  
567                0.2650          0.4087                  0.12400  
568                0.0000          0.2871                  0.07039  

[569 rows x 32 columns]>


# dtypes
breast_cancer_data.dtypes

id                           int64
diagnosis                   object
Radius_mean                float64
Texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object


# Change id to category
breast_cancer_data["id"] = breast_cancer_data["id"].astype("category")


# Quick Scatterplot Visualization
ax = sns.scatterplot(x='Radius_mean', y='perimeter_mean', data=breast_cancer_data, hue='diagnosis')
ax.set(xlabel='Mean Radius', ylabel='Mean Perimeter', title='Scatterplot of Breast Cancer Diagnosis')

[Text(0.5, 0, 'Mean Radius'),
 Text(0, 0.5, 'Mean Perimeter'),
 Text(0.5, 1.0, 'Scatterplot of Breast Cancer Diagnosis')]


# Hot encode categorical variable
bcd = pd.get_dummies(breast_cancer_data.drop(['id'],axis=1))
bcd1 = bcd.drop(['diagnosis_B'],axis=1)
bcd1


# Do train test split using 80/20 split and x & y
train_data, test_data = train_test_split(bcd1, test_size=0.2)
train_data_y = train_data['diagnosis_M']
train_data_x = train_data.drop(columns=['diagnosis_M'])
test_data_y = test_data['diagnosis_M']
test_data_x = test_data.drop(columns=['diagnosis_M'])


# Preform Logistic Regression
log_reg = LogisticRegression(max_iter=3000)
log_reg.fit(train_data_x,train_data_y)

LogisticRegression(max_iter=3000)


# Predict on test data 
y_pred = log_reg.predict(test_data_x)


# Confusion Matrix gives us 95.6% accuracy
confusion_matrix(y_pred,test_data_y)

array([[68,  3],
       [ 2, 41]], dtype=int64)


# Create new dataframe to visualize accuracy
bcd2 = test_data
bcd2["diagnosis_pred"] = y_pred
bcd2.loc[bcd2['diagnosis_M'] == bcd2['diagnosis_pred'], 'Accuracy'] = 'Correct'
bcd2.loc[bcd2['diagnosis_M'] != bcd2['diagnosis_pred'], 'Accuracy'] = 'Incorrect'
bcd2.loc[bcd2['diagnosis_M'] == 1, 'Diagnosis_true'] = 'M'
bcd2.loc[bcd2['diagnosis_M'] != 1, 'Diagnosis_true'] = 'B'


# Cast columns as categorical
bcd2["Accuracy"] = bcd2["Accuracy"].astype("category")
bcd2["Diagnosis_true"] = bcd2["Diagnosis_true"].astype("category")


# Quick view
bcd2


# Create scatter plot to visualize result
gx = sns.scatterplot(x='Radius_mean', y='perimeter_mean', data=bcd2, hue='Accuracy',style="Diagnosis_true")
gx.set(xlabel='Mean Radius', ylabel='Mean Perimeter', title='Scatterplot of Breast Cancer Logistic Regression Prediction On Test Data')

[Text(0.5, 0, 'Mean Radius'),
 Text(0, 0.5, 'Mean Perimeter'),
 Text(0.5, 1.0, 'Scatterplot of Breast Cancer Logistic Regression Prediction On Test Data')]


# Other Parameters
print('Intercept is', log_reg.intercept_)
print('Coefficients are', log_reg.coef_)

Intercept is [-23.06487791]
Coefficients are [[-0.89510767 -0.19950075  0.19813797 -0.01709282  0.15074503  0.18661899
   0.48322892  0.24736118  0.25028976  0.02861489  0.05131253 -0.94746801
   0.1976007   0.08442126  0.02079837 -0.03942007  0.06354866  0.03564237
   0.03529912 -0.01156539 -0.48316096  0.415974    0.12568589  0.01491001
   0.29112951  0.69990637  1.48231777  0.50418023  0.67023358  0.09829928]]

	Radius_mean	Texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	symmetry_mean	fractal_dimension_mean	...	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst	diagnosis_M	diagnosis_pred	Accuracy	Diagnosis_true
166	10.80	9.71	68.77	357.6	0.09594	0.05736	0.025310	0.016980	0.1381	0.06400	...	0.14360	0.12570	0.10470	0.04603	0.2090	0.07699	0	0	Correct	B
482	13.47	14.06	87.32	546.3	0.10710	0.11550	0.057860	0.052660	0.1779	0.06639	...	0.13930	0.24990	0.18480	0.13350	0.3227	0.09326	0	0	Correct	B
212	28.11	18.47	188.50	2499.0	0.11420	0.15160	0.320100	0.159500	0.1648	0.05525	...	0.11420	0.15160	0.32010	0.15950	0.1648	0.05525	1	1	Correct	M
562	15.22	30.62	103.40	716.9	0.10480	0.20870	0.255000	0.094290	0.2128	0.07152	...	0.14170	0.79170	1.17000	0.23560	0.4089	0.14090	1	1	Correct	M
510	11.74	14.69	76.31	426.0	0.08099	0.09661	0.067260	0.026390	0.1499	0.06758	...	0.10730	0.27930	0.26900	0.10560	0.2604	0.09879	0	0	Correct	B
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
157	16.84	19.46	108.40	880.2	0.07445	0.07223	0.051500	0.027710	0.1844	0.05268	...	0.08774	0.17100	0.18820	0.08436	0.2527	0.05972	0	1	Incorrect	B
296	10.91	12.35	69.14	363.7	0.08518	0.04721	0.012360	0.013690	0.1449	0.06031	...	0.09312	0.07506	0.02884	0.03194	0.2143	0.06643	0	0	Correct	B
396	13.51	18.89	88.10	558.1	0.10590	0.11470	0.085800	0.053810	0.1806	0.06079	...	0.14280	0.25700	0.34380	0.14530	0.2666	0.07686	0	0	Correct	B
334	12.30	19.02	77.88	464.4	0.08313	0.04202	0.007756	0.008535	0.1539	0.05945	...	0.12220	0.09052	0.03619	0.03983	0.2554	0.07207	0	0	Correct	B
495	14.87	20.21	96.12	680.9	0.09587	0.08345	0.068240	0.049510	0.1487	0.05748	...	0.12160	0.13880	0.17000	0.10170	0.2369	0.06599	0	0	Correct	B

	Radius_mean	Texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	symmetry_mean	fractal_dimension_mean	...	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst	diagnosis_M
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.30010	0.14710	0.2419	0.07871	...	17.33	184.60	2019.0	0.16220	0.66560	0.7119	0.2654	0.4601	0.11890	1
1	20.57	21.77	132.90	1326.0	0.08474	0.07864	0.08690	0.07017	0.1812	0.05667	...	23.41	158.80	1956.0	0.12380	0.18660	0.2416	0.1860	0.2750	0.08902	1
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.19740	0.12790	0.2069	0.05999	...	25.53	152.50	1709.0	0.14440	0.42450	0.4504	0.2430	0.3613	0.08758	1
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.24140	0.10520	0.2597	0.09744	...	26.50	98.87	567.7	0.20980	0.86630	0.6869	0.2575	0.6638	0.17300	1
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.19800	0.10430	0.1809	0.05883	...	16.67	152.20	1575.0	0.13740	0.20500	0.4000	0.1625	0.2364	0.07678	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
564	21.56	22.39	142.00	1479.0	0.11100	0.11590	0.24390	0.13890	0.1726	0.05623	...	26.40	166.10	2027.0	0.14100	0.21130	0.4107	0.2216	0.2060	0.07115	1
565	20.13	28.25	131.20	1261.0	0.09780	0.10340	0.14400	0.09791	0.1752	0.05533	...	38.25	155.00	1731.0	0.11660	0.19220	0.3215	0.1628	0.2572	0.06637	1
566	16.60	28.08	108.30	858.1	0.08455	0.10230	0.09251	0.05302	0.1590	0.05648	...	34.12	126.70	1124.0	0.11390	0.30940	0.3403	0.1418	0.2218	0.07820	1
567	20.60	29.33	140.10	1265.0	0.11780	0.27700	0.35140	0.15200	0.2397	0.07016	...	39.42	184.60	1821.0	0.16500	0.86810	0.9387	0.2650	0.4087	0.12400	1
568	7.76	24.54	47.92	181.0	0.05263	0.04362	0.00000	0.00000	0.1587	0.05884	...	30.37	59.16	268.6	0.08996	0.06444	0.0000	0.0000	0.2871	0.07039	0